Introduction

Here is my analysis of data/Carseats_org.csv.

Configuration

library(leaps)
library(data.table)
library(stringr)
library(caret)
library(ggplot2)
library(DataExplorer)
library(dplyr)
library(ggExtra)
library(RColorBrewer)
library(plotly)
library(corrplot)
library(htmltools)
library(MASS)
library(GPArotation)
library(psych)
library(mgcv)
library(tidyverse)
library(mltools)
library(e1071)

System Information

Due to the large number of libraries in use I have provided system information.

# Record the full session (R version, platform, attached package versions) for reproducibility.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS/LAPACK: /anaconda3/lib/R/lib/libRblas.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] e1071_1.7-0.1         mltools_0.3.5         forcats_0.3.0        
##  [4] purrr_0.2.5           readr_1.3.1           tidyr_0.8.2          
##  [7] tibble_2.0.1          tidyverse_1.2.1       mgcv_1.8-26          
## [10] nlme_3.1-137          psych_1.8.12          GPArotation_2014.11-1
## [13] MASS_7.3-51.1         htmltools_0.3.6       corrplot_0.84        
## [16] plotly_4.8.0          RColorBrewer_1.1-2    ggExtra_0.8          
## [19] dplyr_0.8.0.1         DataExplorer_0.7.0    caret_6.0-81         
## [22] ggplot2_3.1.0         lattice_0.20-38       stringr_1.4.0        
## [25] data.table_1.12.0     leaps_3.0            
## 
## loaded via a namespace (and not attached):
##  [1] lubridate_1.7.4    httr_1.4.0         tools_3.5.1       
##  [4] backports_1.1.3    R6_2.4.0           rpart_4.1-13      
##  [7] lazyeval_0.2.1     colorspace_1.4-0   nnet_7.3-12       
## [10] withr_2.1.2        tidyselect_0.2.5   gridExtra_2.3     
## [13] mnormt_1.5-5       compiler_3.5.1     cli_1.0.1         
## [16] rvest_0.3.2        xml2_1.2.0         scales_1.0.0      
## [19] digest_0.6.18      foreign_0.8-71     rmarkdown_1.11    
## [22] pkgconfig_2.0.2    htmlwidgets_1.3    rlang_0.3.1       
## [25] readxl_1.3.0       rstudioapi_0.9.0   shiny_1.2.0       
## [28] generics_0.0.2     jsonlite_1.6       ModelMetrics_1.1.0
## [31] magrittr_1.5       Matrix_1.2-15      Rcpp_1.0.0        
## [34] munsell_0.5.0      stringi_1.3.1      yaml_2.2.0        
## [37] plyr_1.8.4         recipes_0.1.4      grid_3.5.1        
## [40] parallel_3.5.1     promises_1.0.1     crayon_1.3.4      
## [43] miniUI_0.1.1.1     haven_2.0.0        splines_3.5.1     
## [46] hms_0.4.2          knitr_1.21         pillar_1.3.1      
## [49] igraph_1.2.4       reshape2_1.4.3     codetools_0.2-16  
## [52] stats4_3.5.1       glue_1.3.0         evaluate_0.12     
## [55] modelr_0.1.3       httpuv_1.4.5.1     foreach_1.4.4     
## [58] networkD3_0.4      cellranger_1.1.0   gtable_0.2.0      
## [61] assertthat_0.2.0   xfun_0.4           gower_0.1.2       
## [64] mime_0.6           prodlim_2018.04.18 xtable_1.8-3      
## [67] broom_0.5.1        later_0.8.0        class_7.3-15      
## [70] survival_2.43-3    viridisLite_0.3.0  timeDate_3043.102 
## [73] iterators_1.0.10   lava_1.6.5         ipred_0.9-8
# Report versions of the notebook toolchain packages. vapply() replaces
# sapply(): it pins the return type to character(1), so the result is
# guaranteed to be a named character vector regardless of input.
vapply(c('repr', 'IRdisplay', 'IRkernel'),
       function(p) paste(packageVersion(p)),
       character(1))
##      repr IRdisplay  IRkernel 
##  "0.19.2"   "0.7.0"  "0.8.15"

Data

I load the data into R, and drop the “X” column (the exported row-index column).

# Load the raw data. TRUE is spelled out rather than T (T/F are ordinary,
# reassignable variables and should not be relied on).
carseats <- read.csv("data/Carseats_org.csv", header = TRUE, stringsAsFactors = TRUE)

# "X" is the exported row-index column; drop it by name.
drops <- c("X")
carseats <- carseats[ , !(names(carseats) %in% drops)]
head(carseats, 10)
##    Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1   9.50       138     73          11        276   120       Bad  42
## 2  11.22       111     48          16        260    83      Good  65
## 3  10.06       113     35          10        269    80    Medium  59
## 4   7.40       117    100           4        466    97    Medium  55
## 5   4.15       141     64           3        340   128       Bad  38
## 6  10.81       124    113          13        501    72       Bad  78
## 7   6.63       115    105           0         45   108    Medium  71
## 8  11.85       136     81          15        425   120      Good  67
## 9   6.54       132    110           0        108   124    Medium  76
## 10  4.69       132    113           0        131   124    Medium  76
##    Education Urban  US
## 1         17   Yes Yes
## 2         10   Yes Yes
## 3         12   Yes Yes
## 4         14   Yes Yes
## 5         13   Yes  No
## 6         16    No Yes
## 7         15   Yes  No
## 8         10   Yes Yes
## 9         10    No  No
## 10        17    No Yes

Here I create two new data frames to manage numeric and categorical data.

# get vectors of continuous and categorical cols
# select_if() (dplyr 0.8 API in this session) keeps only the columns matching
# the predicate, yielding separate data frames of numeric and factor columns.
nums <- dplyr::select_if(carseats, is.numeric)
cats <- dplyr::select_if(carseats, is.factor)

# Preview 10 randomly drawn rows of the numeric subset (no RNG seed is set,
# so the rows shown differ between runs).
nums[sample(nrow(nums), 10), ]
##     Sales CompPrice Income Advertising Population Price Age Education
## 191  8.79       130     37          13        297   101  37        13
## 175  0.00       139     24           0        358   185  79        15
## 176  7.54       115     89           0         38   122  25        12
## 345  8.43       138     80           0        108   126  70        13
## 218  4.34       106     44           0        481   111  70        14
## 311  9.53       175     65          29        419   166  53        12
## 55   4.90       134    103          13         25   144  76        17
## 393  4.53       129     42          13        315   130  34        13
## 357  3.58       142    109           0        111   164  72        12
## 279  7.22       114    113           2        129   151  40        15
# Preview 10 randomly drawn rows of the categorical subset.
cats[sample(nrow(cats), 10), ]
##     ShelveLoc Urban  US
## 353      Good   Yes Yes
## 77     Medium   Yes Yes
## 287    Medium    No Yes
## 25        Bad   Yes Yes
## 123    Medium   Yes Yes
## 216       Bad   Yes Yes
## 42        Bad   Yes  No
## 180    Medium   Yes Yes
## 104       Bad   Yes Yes
## 397    Medium    No Yes

Numeric Summaries

# Five-number summaries plus means for every numeric column.
summary(nums)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price            Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8   Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0   3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0   Max.   :80.00   Max.   :18.0
# Compact structure view: storage type and first values of each numeric column.
str(nums)
## 'data.frame':    400 obs. of  8 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : int  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : int  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: int  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : int  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : int  120 83 80 97 128 72 108 120 124 124 ...
##  $ Age        : int  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : int  17 10 12 14 13 16 15 10 10 17 ...

Categorical Summaries

# Level counts for each factor column.
summary(cats)
##   ShelveLoc   Urban       US     
##  Bad   : 96   No :118   No :142  
##  Good  : 85   Yes:282   Yes:258  
##  Medium:219
# Structure view of the factor columns and their level encodings.
str(cats)
## 'data.frame':    400 obs. of  3 variables:
##  $ ShelveLoc: Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Urban    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US       : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Data Dimensionality

This command is to inspect the different data types in the data.

# Inspect the type of every column in the full data set.
str(carseats)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : int  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : int  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: int  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : int  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : int  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : int  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : int  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
# Column counts overall and by type, plus explicit dimensions of each frame.
print(paste('Number of Columns:', ncol(carseats)))
## [1] "Number of Columns: 11"
print(paste('Number of Numeric Columns:', ncol(nums)))
## [1] "Number of Numeric Columns: 8"
print(paste('Number of Categorical Columns:', ncol(cats)))
## [1] "Number of Categorical Columns: 3"
dim(carseats)
## [1] 400  11
dim(nums)
## [1] 400   8
dim(cats)
## [1] 400   3

Here’s a quick way to examine general properties of the data:

# One-row overview: row/column counts by type, missing values, memory usage.
DataExplorer::introduce(data=carseats)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  400      11                3                  8                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           400               4400        24000

Finally, I want to look at the first and last rows of the data set. Just to be safe:

# Sanity check: first and last rows of the full, numeric, and categorical frames.
head(carseats, 2)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1  9.50       138     73          11        276   120       Bad  42
## 2 11.22       111     48          16        260    83      Good  65
##   Education Urban  US
## 1        17   Yes Yes
## 2        10   Yes Yes
tail(carseats, 2)
##     Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 399  5.94       100     79           7        284    95       Bad  50
## 400  9.71       134     37           0         27   120      Good  49
##     Education Urban  US
## 399        12   Yes Yes
## 400        16   Yes Yes
head(nums, 2)
##   Sales CompPrice Income Advertising Population Price Age Education
## 1  9.50       138     73          11        276   120  42        17
## 2 11.22       111     48          16        260    83  65        10
tail(nums, 2)
##     Sales CompPrice Income Advertising Population Price Age Education
## 399  5.94       100     79           7        284    95  50        12
## 400  9.71       134     37           0         27   120  49        16
head(cats, 2)
##   ShelveLoc Urban  US
## 1       Bad   Yes Yes
## 2      Good   Yes Yes
tail(cats, 2)
##     ShelveLoc Urban  US
## 399       Bad   Yes Yes
## 400      Good   Yes Yes

Numeric Plotting

I start out with a few general scatter plots.

# Sales vs Age, coloured by shelf location. Width is set in plot_ly() rather
# than layout(): specifying width/height in layout() is deprecated (see the
# warning this chunk previously emitted).
plot_ly(data=carseats,
        x=~Age,
        y=~Sales,
        mode = 'markers',
        type = 'scatter',
        color=~ShelveLoc,
        width=900) %>%
            layout(title = "Age, Shelf Location, and Sales Scatter Plot")
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

This plot below shows good separation and a weak linear trend. These variables are worth investigating.

# Sales vs Price, coloured by shelf location. Width moved from layout()
# (deprecated) into plot_ly().
plot_ly(data=carseats,
        x=~Price,
        y=~Sales,
        mode = 'markers',
        type = 'scatter',
        color=~ShelveLoc,
        width=900) %>%
            layout(title = "Price, Shelf Location, and Sales Scatter Plot")
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Here we inspect the density of the ‘Price vs Sales’ relationship:

# Sales vs Price with marker size encoding Price. Width moved from layout()
# (deprecated) into plot_ly().
plot_ly(data=carseats,
        x=~Price,
        y=~Sales,
        mode = 'markers',
        size = ~Price,
        type = 'scatter',
        colors = "Dark2",
        alpha = .6,
        width=900) %>%
            layout(title = "Price, US, and Sales Scatter Plot")
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
## Warning: `line.width` does not currently support multiple values.

Normalization

I choose to normalize the numeric data in order to be able to plot each variable on the same scale. This will allow me to investigate the variation of each predictor relative to Sales.

# Center and scale every numeric column (z-scores) using caret's preProcess;
# the fitted preObj is then applied to produce the scaled frame.
preObj <- preProcess(nums, method=c("center", "scale"))
scaled.nums <- predict(preObj, nums)

head(scaled.nums, 2)
##       Sales  CompPrice     Income Advertising  Population     Price
## 1 0.7094877  0.8493913  0.1551667    0.656355  0.07572445  0.177601
## 2 1.3185281 -0.9113430 -0.7381360    1.408194 -0.03284107 -1.385119
##          Age Education
## 1 -0.6989069  1.182968
## 2  0.7208201 -1.488250
# Last rows of the scaled data.
tail(scaled.nums, 2)
##          Sales  CompPrice     Income Advertising Population      Price
## 399 -0.5510841 -1.6286792  0.3695593  0.05488421  0.1300072 -0.8782909
## 400  0.7838473  0.5885417 -1.1311891 -0.99768973 -1.6138265  0.1776010
##            Age  Education
## 399 -0.2050888 -0.7250447
## 400 -0.2668161  0.8013652
# Confirm every scaled column is still numeric.
str(scaled.nums)
## 'data.frame':    400 obs. of  8 variables:
##  $ Sales      : num  0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
##  $ CompPrice  : num  0.849 -0.911 -0.781 -0.52 1.045 ...
##  $ Income     : num  0.155 -0.738 -1.203 1.12 -0.166 ...
##  $ Advertising: num  0.656 1.408 0.506 -0.396 -0.547 ...
##  $ Population : num  0.0757 -0.0328 0.0282 1.3649 0.51 ...
##  $ Price      : num  0.178 -1.385 -1.512 -0.794 0.515 ...
##  $ Age        : num  -0.699 0.721 0.35 0.104 -0.946 ...
##  $ Education  : num  1.183 -1.4882 -0.725 0.0382 -0.3434 ...
# Prints an empty string purely as a visual spacer in the rendered output.
print("")
## [1] ""
# After centering and scaling, every column has mean 0 (unit variance).
summary(scaled.nums)
##      Sales            CompPrice            Income        
##  Min.   :-2.65440   Min.   :-3.12856   Min.   :-1.70290  
##  1st Qu.:-0.74584   1st Qu.:-0.65049   1st Qu.:-0.92573  
##  Median :-0.00224   Median : 0.00163   Median : 0.01224  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.64575   3rd Qu.: 0.65375   3rd Qu.: 0.79834  
##  Max.   : 3.10670   Max.   : 3.26225   Max.   : 1.83458  
##   Advertising        Population           Price         
##  Min.   :-0.9977   Min.   :-1.72918   Min.   :-3.87702  
##  1st Qu.:-0.9977   1st Qu.:-0.85387   1st Qu.:-0.66711  
##  Median :-0.2459   Median : 0.04858   Median : 0.05089  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.8067   3rd Qu.: 0.90693   3rd Qu.: 0.64219  
##  Max.   : 3.3630   Max.   : 1.65671   Max.   : 3.17633  
##       Age             Education       
##  Min.   :-1.74827   Min.   :-1.48825  
##  1st Qu.:-0.83779   1st Qu.:-0.72504  
##  Median : 0.07268   Median : 0.03816  
##  Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.78255   3rd Qu.: 0.80137  
##  Max.   : 1.64673   Max.   : 1.56457

Distributions

Here are scaled distributions (histograms and density plots) for each numeric variable, including Sales. The variables relating to money ($) tend to be approximately normal. Many other variables tend to be approximately uniform, which does not bode well for their predictive power.

# Overlaid histogram + density of every (scaled) numeric variable, one facet
# per variable. The original applied facet_wrap() twice with identical
# arguments — the second call simply replaced the first, so the duplicate has
# been removed. The "Continous" typo in the title is also corrected.
# (..density.. is retained: after_stat() is not available in the ggplot2
# version this session uses.)
scaled.nums %>%
    tidyr::gather() %>%
        ggplot(aes(x=value,y=..density..))+

            ggtitle('Distributions of Continuous Variables (scaled)') +

            facet_wrap(~ key, scales = "free") +
            geom_histogram(fill=I("orange"), col=I("black"), bins = 50) +
            geom_density(color="blue", fill='light blue', alpha = 0.4)

Here we plot all numeric variables against their distributions. This is just another way to examine the information shown above.

# Jittered box plots of all scaled numeric variables on one shared axis.
# width/height moved from layout() (deprecated) into plot_ly().
scaled.nums %>%
    tidyr::gather() %>%
        plot_ly(x=~key, y=~value,
                type = "box",
                boxpoints = "all",
                jitter = 0.4,
                pointpos = 0,
                color = ~key,
                colors = "Dark2",
                width = 900,
                height = 500) %>%
                      subplot(shareX = TRUE)  %>%
                            layout(title = "Numeric Variable Distributions (scaled)",
                                  yaxis=list(title='Standard Deviation'),
                                  xaxis=list(title='Variable'),
                                  autosize=FALSE)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Scatterplots

Here we plot all numeric variables against Sales (scaled). This allows us to investigate possible linear relationships between that variable and Sales. As shown below, only ‘Price’ appears to have a linear relationship worth investigating. This took me so long to figure out.

# Build one Sales-vs-predictor scatter plot per numeric column (Sales itself,
# column 1, excluded). Plots are collected in an htmltools tagList so printing
# the list renders every widget. Marker size/colour both encode Sales.
numeric.scatterplots <- htmltools::tagList()
count <- 1  # <- rather than = for top-level assignment

# names(scaled.nums)[-1] avoids materialising the column-dropped data frame
# that names(scaled.nums[,-1]) would create.
for (i in names(scaled.nums)[-1]) {

  numeric.scatterplots[[count]] <- plot_ly(scaled.nums, x=scaled.nums[,i], y=scaled.nums$Sales,
                    colors = 'RdYlGn',
                    mode = 'markers',
                    type = 'scatter',
                    size = scaled.nums$Sales^2,
                    color = scaled.nums$Sales,
                    marker = list(line = list(color = 'black',width = 2)),
                    name=i) %>%  # paste(i) was a no-op; i is already character
          layout(title = paste(i, "vs Sales (scaled)<br>Size=Sales^2"),
                               yaxis=list(title='Sales'),
                               xaxis=list(title=i),
                               showlegend = FALSE)

  count <- count + 1

}

numeric.scatterplots
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

By adding naive regression lines to a few scatter plots we can confirm our suspicions:

# Naive single-predictor regressions against (scaled) Sales.
fit.Pop <- lm(Sales ~ Population, data = scaled.nums)
fit.Age <- lm(Sales ~ Age, data = scaled.nums)
fit.CompPrice <- lm(Sales ~ CompPrice, data = scaled.nums)
fit.Price <- lm(Sales ~ Price, data = scaled.nums)

# Helper: scatter of one predictor vs Sales with its fitted regression line.
# BUG FIX: in the original, xaxis=list(...) was accidentally nested inside the
# yaxis list in all four plots, so the x-axis title was silently dropped; the
# axes are now separate entries of layout(). The repeated plot code is also
# factored into this single function.
regression.plot <- function(var, fit) {
  plot_ly(scaled.nums,
          x = scaled.nums[[var]],
          name = paste(var, 'vs Sales Regression Line')) %>%
    add_markers(y = ~Sales,
                name = paste(var, 'vs Sales Observations')) %>%
    add_lines(x = scaled.nums[[var]],
              y = fitted(fit)) %>%
    layout(title = paste(var, "vs Sales"),
           yaxis = list(title = 'Sales'),
           xaxis = list(title = var),
           showlegend = FALSE)
}

regression.scatterplots <- htmltools::tagList()
regression.scatterplots[[1]] <- regression.plot("Population", fit.Pop)
regression.scatterplots[[2]] <- regression.plot("Age", fit.Age)
regression.scatterplots[[3]] <- regression.plot("CompPrice", fit.CompPrice)
regression.scatterplots[[4]] <- regression.plot("Price", fit.Price)

regression.scatterplots

Let’s compare the slopes:

# All four naive regression lines on a common (scaled) axis so their slopes
# can be compared directly. Width moved from layout() (deprecated) to plot_ly().
scaled.nums %>%
    plot_ly(y = ~Sales, width = 900) %>%
      add_lines(x= ~Population, y = fitted(fit.Pop),
                name = "fit.Pop slope", line = list(shape = "linear")) %>%
      add_lines(x= ~Age, y = fitted(fit.Age),
                name = "fit.Age slope", line = list(shape = "linear")) %>%
      add_lines(x= ~CompPrice, y = fitted(fit.CompPrice),
                name = "fit.CompPrice slope", line = list(shape = "linear")) %>%
      add_lines(x= ~Price, y = fitted(fit.Price),
                name = "fit.Price slope", line = list(shape = "linear")) %>%

    layout(title = "Regression Lines vs Sales (scaled)",
           autosize=FALSE,
           yaxis=list(title='Sales'),
           xaxis=list(title='Scaled Numeric Variable'))
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Here’s a pretty graphic that doesn’t help me understand anything about the data.

# Marginal histograms around a 2-D density contour of Price vs Sales.
# Fixes: <- for assignment, FALSE instead of F, and an explicit trace
# type/mode for plotly_empty() (silences the "No trace type specified"
# warning this chunk previously emitted).
y <- scaled.nums$Sales
x <- scaled.nums$Price

s <- subplot(
  plot_ly(x = x, color = I("black"), type = 'histogram'),
  plotly_empty(type = 'scatter', mode = 'markers'),
  plot_ly(x = x, y = y, type = 'histogram2dcontour', showscale = FALSE),
  plot_ly(y = y, color = I("black"), type = 'histogram'),
  nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2),
  shareX = TRUE, shareY = TRUE, titleX = FALSE, titleY = FALSE)
layout(s, showlegend = FALSE, autosize=FALSE,
           width=700,
           height=500,
           yaxis=list(title='Sales'),
           xaxis=list(title='Price'))
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()

Categorical Plotting

First, let’s create a data frame that we can use:

# Attach the scaled response to the categorical predictors for plotting.
# <- replaces = for top-level assignment.
categorical.by.sales <- cbind(Sales = scaled.nums$Sales, cats)
str(categorical.by.sales)
## 'data.frame':    400 obs. of  4 variables:
##  $ Sales    : num  0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
##  $ ShelveLoc: Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Urban    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US       : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

Here we can see all categorical by Sales. We suspected that ‘ShelveLoc’ would be important based on one of the early scatter plots. It seems that this is the case.

# One box plot (with jittered observations) of Sales per categorical variable,
# collected into a tagList so printing renders every widget.
categorical.boxplots <- htmltools::tagList()
count <- 1  # <- rather than = for top-level assignment

for (i in names(categorical.by.sales)[-1]) {

  categorical.boxplots[[count]] <- plot_ly(categorical.by.sales, x=categorical.by.sales[,i], y=categorical.by.sales$Sales,
                        type = "box",
                        boxpoints = "all",
                        jitter = .2,
                        pointpos = 0,
                        color =categorical.by.sales[,i],
                        colors='Set1',
                        name=i) %>%  # paste(i) was a no-op; i is already character
                            layout(title = paste(i, "vs Sales (scaled)"),
                                  showlegend = TRUE,
                                  yaxis=list(title='Sales Standard Deviation'),
                                  xaxis=list(title=i))

  count <- count + 1

}

categorical.boxplots

Here’s the same thing, but more musically:

# Violin plots (with embedded box and mean line) of Sales for each
# categorical variable.
categorical.violins <- htmltools::tagList()
count <- 1  # <- rather than = for top-level assignment

for (i in names(categorical.by.sales)[-1]) {

  categorical.violins[[count]] <- plot_ly(categorical.by.sales, x=categorical.by.sales[,i], y=categorical.by.sales$Sales,
                        split = categorical.by.sales[,i],
                        type = 'violin',
                        colors='Set1',
                        name=i,
                        box = list(visible = TRUE),
                        meanline = list(visible = TRUE)) %>%
                            # BUG FIX: the x-axis title was hard-coded to "US"
                            # for all three plots; it now labels the variable
                            # actually being plotted.
                            layout(xaxis = list(title = i),
                                yaxis = list(title = "Sales",zeroline = FALSE))


  count <- count + 1

}

categorical.violins

Linear Regression

First, let’s merge the data set into a single data frame

# Recombine the categorical predictors with the scaled numeric columns into a
# single modelling frame; [,-1] drops the Sales column from the categorical
# frame since scaled.nums already carries it.
scaled.merged <- cbind(categorical.by.sales[,-1], scaled.nums)
str(scaled.merged)
## 'data.frame':    400 obs. of  11 variables:
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
##  $ Sales      : num  0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
##  $ CompPrice  : num  0.849 -0.911 -0.781 -0.52 1.045 ...
##  $ Income     : num  0.155 -0.738 -1.203 1.12 -0.166 ...
##  $ Advertising: num  0.656 1.408 0.506 -0.396 -0.547 ...
##  $ Population : num  0.0757 -0.0328 0.0282 1.3649 0.51 ...
##  $ Price      : num  0.178 -1.385 -1.512 -0.794 0.515 ...
##  $ Age        : num  -0.699 0.721 0.35 0.104 -0.946 ...
##  $ Education  : num  1.183 -1.4882 -0.725 0.0382 -0.3434 ...
# Unscaled head/tail shown for comparison against the merged, scaled frame.
head(nums, 2)
##   Sales CompPrice Income Advertising Population Price Age Education
## 1  9.50       138     73          11        276   120  42        17
## 2 11.22       111     48          16        260    83  65        10
tail(nums, 2)
##     Sales CompPrice Income Advertising Population Price Age Education
## 399  5.94       100     79           7        284    95  50        12
## 400  9.71       134     37           0         27   120  49        16
# Head/tail of the merged modelling frame (factors + scaled numerics).
head(scaled.merged, 2)
##   ShelveLoc Urban  US     Sales  CompPrice     Income Advertising
## 1       Bad   Yes Yes 0.7094877  0.8493913  0.1551667    0.656355
## 2      Good   Yes Yes 1.3185281 -0.9113430 -0.7381360    1.408194
##    Population     Price        Age Education
## 1  0.07572445  0.177601 -0.6989069  1.182968
## 2 -0.03284107 -1.385119  0.7208201 -1.488250
tail(scaled.merged, 2)
##     ShelveLoc Urban  US      Sales  CompPrice     Income Advertising
## 399       Bad   Yes Yes -0.5510841 -1.6286792  0.3695593  0.05488421
## 400      Good   Yes Yes  0.7838473  0.5885417 -1.1311891 -0.99768973
##     Population      Price        Age  Education
## 399  0.1300072 -0.8782909 -0.2050888 -0.7250447
## 400 -1.6138265  0.1776010 -0.2668161  0.8013652

First, let’s look at some things that may give us trouble. Luckily it looks like the only serious correlation is with our dependent variable. We’ll want to watch the ‘Price’ vs ‘CompPrice’ relationship.

# Pairwise correlations of the scaled numeric variables, rendered as an
# upper-triangle corrplot with hierarchical-cluster ordering.
res <- cor(scaled.nums)

corrplot(res, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)

It appears that the residuals are roughly symmetrical around 0, which is what we would hope for. Even so, several predictors contribute little — note how close to zero several of the coefficient estimates are.

# Baseline multiple regression of Sales on all predictors
# (factors dummy-coded automatically by lm()).
simple.lm <- lm(Sales~., data=scaled.merged)
simple.summary <- summary(simple.lm)

print(simple.summary)
## 
## Call:
## lm(formula = Sales ~ ., data = scaled.merged)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01598 -0.24463  0.00748  0.23496  1.20797 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.73292    0.05999 -12.217  < 2e-16 ***
## ShelveLocGood    1.71742    0.05422  31.678  < 2e-16 ***
## ShelveLocMedium  0.69286    0.04465  15.516  < 2e-16 ***
## UrbanYes         0.04351    0.04000   1.088    0.277    
## USYes           -0.06519    0.05306  -1.229    0.220    
## CompPrice        0.50397    0.02252  22.378  < 2e-16 ***
## Income           0.15660    0.01828   8.565 2.58e-16 ***
## Advertising      0.28987    0.02619  11.066  < 2e-16 ***
## Population       0.01085    0.01933   0.561    0.575    
## Price           -0.79946    0.02239 -35.700  < 2e-16 ***
## Age             -0.26413    0.01825 -14.472  < 2e-16 ***
## Education       -0.01958    0.01830  -1.070    0.285    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3608 on 388 degrees of freedom
## Multiple R-squared:  0.8734, Adjusted R-squared:  0.8698 
## F-statistic: 243.4 on 11 and 388 DF,  p-value: < 2.2e-16

Linear Models and Subsets

Let’s do the same thing, but control the subsets using leaps

# Exhaustive best-subset selection (leaps), considering models with up to 5
# predictors. <- replaces = for top-level assignment.
regfit.full <- regsubsets(Sales~., data=scaled.merged, nvmax=5)
reg.summary <- summary(regfit.full)

print(reg.summary)
## Subset selection object
## Call: (function(input_file, encoding) {
##     out_dir <- "docs"
##     rmarkdown::render(input_file, encoding = encoding, output_file = file.path(dirname(input_file), 
##         out_dir, "carseats.html"))
## })("/Users/milk/Desktop/git/EDA_r_ece579/carseats.Rmd", encoding = "UTF-8")
## 11 Variables  (and intercept)
##                 Forced in Forced out
## ShelveLocGood       FALSE      FALSE
## ShelveLocMedium     FALSE      FALSE
## UrbanYes            FALSE      FALSE
## USYes               FALSE      FALSE
## CompPrice           FALSE      FALSE
## Income              FALSE      FALSE
## Advertising         FALSE      FALSE
## Population          FALSE      FALSE
## Price               FALSE      FALSE
## Age                 FALSE      FALSE
## Education           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          ShelveLocGood ShelveLocMedium UrbanYes USYes CompPrice Income
## 1  ( 1 ) "*"           " "             " "      " "   " "       " "   
## 2  ( 1 ) "*"           " "             " "      " "   " "       " "   
## 3  ( 1 ) "*"           " "             " "      " "   "*"       " "   
## 4  ( 1 ) "*"           " "             " "      " "   "*"       " "   
## 5  ( 1 ) "*"           "*"             " "      " "   "*"       " "   
##          Advertising Population Price Age Education
## 1  ( 1 ) " "         " "        " "   " " " "      
## 2  ( 1 ) " "         " "        "*"   " " " "      
## 3  ( 1 ) " "         " "        "*"   " " " "      
## 4  ( 1 ) "*"         " "        "*"   " " " "      
## 5  ( 1 ) "*"         " "        "*"   " " " "

We’ll just take code straight from the example on Canvas…

# 2x2 panel of fit criteria (RSS, adjusted R^2, Cp, BIC) versus subset size.
# NOTE(review): par() is changed without saving/restoring the previous
# settings — fine inside a knitr chunk, but it would leak into later plots in
# an interactive session; consider op <- par(...); on.exit(par(op)).
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')

# Built-in regsubsets() plots: which variables enter the best model of each
# size, ranked by four different criteria.
plot(regfit.full,scale="r2")

plot(regfit.full,scale="adjr2")

plot(regfit.full,scale="Cp")

plot(regfit.full,scale="bic")

Interaction Terms

Here we define a new model with some interaction terms: (a) Income and Advertising, (b) Income and CompPrice, and (c) Price and Age.

# Refit with three pairwise interaction terms — Income:Advertising,
# Income:CompPrice, and Price:Age — on top of all main effects.
interaction.lm <- lm(Sales~. + Income*Advertising + Income*CompPrice + Price*Age, data=scaled.merged)
interaction.summary <- summary(interaction.lm)

print(interaction.summary)
## 
## Call:
## lm(formula = Sales ~ . + Income * Advertising + Income * CompPrice + 
##     Price * Age, data = scaled.merged)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04967 -0.23955 -0.00936  0.24591  1.19774 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.730051   0.059236 -12.324  < 2e-16 ***
## ShelveLocGood       1.714324   0.053519  32.032  < 2e-16 ***
## ShelveLocMedium     0.680563   0.044177  15.405  < 2e-16 ***
## UrbanYes            0.045257   0.039380   1.149  0.25117    
## USYes              -0.069622   0.052329  -1.330  0.18415    
## CompPrice           0.507567   0.022131  22.935  < 2e-16 ***
## Income              0.152164   0.018025   8.442 6.43e-16 ***
## Advertising         0.289455   0.025756  11.238  < 2e-16 ***
## Population          0.007224   0.018985   0.381  0.70377    
## Price              -0.797242   0.022003 -36.233  < 2e-16 ***
## Age                -0.260783   0.017978 -14.505  < 2e-16 ***
## Education          -0.024248   0.018063  -1.342  0.18027    
## Income:Advertising  0.046378   0.018170   2.552  0.01108 *  
## CompPrice:Income   -0.059184   0.018911  -3.130  0.00188 ** 
## Price:Age           0.013171   0.017912   0.735  0.46259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3538 on 385 degrees of freedom
## Multiple R-squared:  0.8792, Adjusted R-squared:  0.8748 
## F-statistic: 200.1 on 14 and 385 DF,  p-value: < 2.2e-16
# Exhaustive best-subsets search (up to 5 predictors) over the same
# interaction-augmented formula.
interaction.lm.subsets <- regsubsets(
  Sales ~ . + Income * Advertising + Income * CompPrice + Price * Age,
  data = scaled.merged,
  nvmax = 5
)
interaction.subsets.summary <- summary(interaction.lm.subsets)
print(interaction.subsets.summary)
## Subset selection object
## Call: (function(input_file, encoding) {
##     out_dir <- "docs"
##     rmarkdown::render(input_file, encoding = encoding, output_file = file.path(dirname(input_file), 
##         out_dir, "carseats.html"))
## })("/Users/milk/Desktop/git/EDA_r_ece579/carseats.Rmd", encoding = "UTF-8")
## 14 Variables  (and intercept)
##                    Forced in Forced out
## ShelveLocGood          FALSE      FALSE
## ShelveLocMedium        FALSE      FALSE
## UrbanYes               FALSE      FALSE
## USYes                  FALSE      FALSE
## CompPrice              FALSE      FALSE
## Income                 FALSE      FALSE
## Advertising            FALSE      FALSE
## Population             FALSE      FALSE
## Price                  FALSE      FALSE
## Age                    FALSE      FALSE
## Education              FALSE      FALSE
## Income:Advertising     FALSE      FALSE
## CompPrice:Income       FALSE      FALSE
## Price:Age              FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: exhaustive
##          ShelveLocGood ShelveLocMedium UrbanYes USYes CompPrice Income
## 1  ( 1 ) "*"           " "             " "      " "   " "       " "   
## 2  ( 1 ) "*"           " "             " "      " "   " "       " "   
## 3  ( 1 ) "*"           " "             " "      " "   "*"       " "   
## 4  ( 1 ) "*"           " "             " "      " "   "*"       " "   
## 5  ( 1 ) "*"           "*"             " "      " "   "*"       " "   
##          Advertising Population Price Age Education Income:Advertising
## 1  ( 1 ) " "         " "        " "   " " " "       " "               
## 2  ( 1 ) " "         " "        "*"   " " " "       " "               
## 3  ( 1 ) " "         " "        "*"   " " " "       " "               
## 4  ( 1 ) "*"         " "        "*"   " " " "       " "               
## 5  ( 1 ) "*"         " "        "*"   " " " "       " "               
##          CompPrice:Income Price:Age
## 1  ( 1 ) " "              " "      
## 2  ( 1 ) " "              " "      
## 3  ( 1 ) " "              " "      
## 4  ( 1 ) " "              " "      
## 5  ( 1 ) " "              " "

Variable Significance

Below we print the coefficients for the 5th model using the default model selection criteria. All coefficients are relatively small, as we would expect from the EDA above. This pretty much confirms what I would have guessed by looking at the data against sales. We still want to watch out for confounding between ‘Price’ and ‘CompPrice.’

coef(interaction.lm.subsets, 5)
##     (Intercept)   ShelveLocGood ShelveLocMedium       CompPrice 
##      -0.6995348       1.6746198       0.6277225       0.5090177 
##     Advertising           Price 
##       0.2832405      -0.7846476

Second Interaction Model

First, drop the columns that are not needed for this analysis:

scaled.merged.slim <- scaled.merged[ , -which(names(scaled.merged) %in% c("US","Urban"))]

A few hyperparameters we'd like to keep consistent across all models:

nvmax <- 3

Forward Selection:

# Forward stepwise selection on the slimmed data with the
# interaction-augmented formula.
interaction.subset.fwd <- regsubsets(
  Sales ~ . + Income * Advertising + Income * CompPrice + Income * Age,
  data = scaled.merged.slim,
  nvmax = nvmax,
  method = "forward"
)
fwd.subset.summary <- summary(interaction.subset.fwd)
# Coefficients for each model size from 1 up to nvmax.
coef(interaction.subset.fwd, seq_len(nvmax))
## [[1]]
##   (Intercept) ShelveLocGood 
##     -0.259671      1.221981 
## 
## [[2]]
##   (Intercept) ShelveLocGood         Price 
##    -0.2708256     1.2744733    -0.4688868 
## 
## [[3]]
##   (Intercept) ShelveLocGood     CompPrice         Price 
##    -0.2709362     1.2749938     0.4932455    -0.7573701

Backward Selection:

Interestingly, backward selection returns exactly the same subsets as forward selection here. This need not indicate a problem with `method="backward"`: with these data the same variables dominate at every model size, so both stepwise directions converge to identical models — as the exhaustive search below confirms.

# Backward stepwise selection with the same formula and data as above.
interaction.subset.bk <- regsubsets(
  Sales ~ . + Income * Advertising + Income * CompPrice + Income * Age,
  data = scaled.merged.slim,
  nvmax = nvmax,
  method = "backward"
)
bk.subset.summary <- summary(interaction.subset.bk)
# Coefficients for each model size from 1 up to nvmax.
coef(interaction.subset.bk, seq_len(nvmax))
## [[1]]
##   (Intercept) ShelveLocGood 
##     -0.259671      1.221981 
## 
## [[2]]
##   (Intercept) ShelveLocGood         Price 
##    -0.2708256     1.2744733    -0.4688868 
## 
## [[3]]
##   (Intercept) ShelveLocGood     CompPrice         Price 
##    -0.2709362     1.2749938     0.4932455    -0.7573701

Exhaustive

# Exhaustive search over the same formula and data, for comparison with
# the two stepwise methods.
interaction.subset.ex <- regsubsets(
  Sales ~ . + Income * Advertising + Income * CompPrice + Income * Age,
  data = scaled.merged.slim,
  nvmax = nvmax,
  method = "exhaustive"
)
ex.subset.summary <- summary(interaction.subset.ex)
# Coefficients for each model size from 1 up to nvmax.
coef(interaction.subset.ex, seq_len(nvmax))
## [[1]]
##   (Intercept) ShelveLocGood 
##     -0.259671      1.221981 
## 
## [[2]]
##   (Intercept) ShelveLocGood         Price 
##    -0.2708256     1.2744733    -0.4688868 
## 
## [[3]]
##   (Intercept) ShelveLocGood     CompPrice         Price 
##    -0.2709362     1.2749938     0.4932455    -0.7573701

This is a list that may come in handy.

# Bundle (name, fit, summary) triples so the evaluation loop below can
# iterate over all three selection methods uniformly.
model.list <- mapply(
  list,
  c("Forward", "Backward", "Exhaustive"),
  list(interaction.subset.fwd, interaction.subset.bk, interaction.subset.ex),
  list(fwd.subset.summary, bk.subset.summary, ex.subset.summary),
  SIMPLIFY = FALSE,
  USE.NAMES = FALSE
)

Evaluation Metric Plotting

It is interesting to see that each model selected the same variables, in the same order.

# One 2x2 panel of selection criteria per method, labeled accordingly.
method.summaries <- list(forward    = fwd.subset.summary,
                         backward   = bk.subset.summary,
                         exhaustive = ex.subset.summary)
for (method in names(method.summaries)) {
  s <- method.summaries[[method]]
  x.label <- paste0("Number of Variables (", method, ")")
  par(mfrow = c(2, 2))
  plot(s$rss, xlab = x.label, ylab = "RSS", type = "l")
  plot(s$adjr2, xlab = x.label, ylab = "Adjusted RSq", type = "l")
  plot(s$cp, xlab = x.label, ylab = "Cp", type = "l")
  plot(s$bic, xlab = x.label, ylab = "BIC", type = "l")
}

Model Equations

Here we print the final equations for each model. Note that they are all the same.

# For each selection method: pick the model size with the minimum BIC,
# then print its fitted equation with coefficients rounded to 2 decimals.
for (entry in model.list) {
  method.name <- entry[[1]]
  fit         <- entry[[2]]
  fit.summary <- entry[[3]]

  best.idx <- which.min(fit.summary$bic)
  best.bic <- fit.summary$bic[best.idx]
  cc       <- coef(fit, best.idx)

  # Build "coef * name" terms for everything except the intercept.
  term.str <- paste(round(cc[-1], 2),
                    names(cc[-1]),
                    sep = " * ", collapse = " + ")
  mod.equation.format <- paste("Y =",
                               paste(round(cc[1], 2), term.str, sep = " + "),
                               "+ e")

  print(paste("Model Selection Method: ", method.name))
  print(paste("Min BIC:", best.bic))
  print(paste("Model Number: ", best.idx))
  print(paste("Model Equation: ", mod.equation.format))
  print("")

}
## [1] "Model Selection Method:  Forward"
## [1] "Min BIC: -373.710213587368"
## [1] "Model Number:  3"
## [1] "Model Equation:  Y = -0.27 + 1.27 * ShelveLocGood + 0.49 * CompPrice + -0.76 * Price + e"
## [1] ""
## [1] "Model Selection Method:  Backward"
## [1] "Min BIC: -373.710213587368"
## [1] "Model Number:  3"
## [1] "Model Equation:  Y = -0.27 + 1.27 * ShelveLocGood + 0.49 * CompPrice + -0.76 * Price + e"
## [1] ""
## [1] "Model Selection Method:  Exhaustive"
## [1] "Min BIC: -373.710213587368"
## [1] "Model Number:  3"
## [1] "Model Equation:  Y = -0.27 + 1.27 * ShelveLocGood + 0.49 * CompPrice + -0.76 * Price + e"
## [1] ""

Formally, we are looking for: \[\hat{Y} = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + \epsilon\] Our model yields: \[\hat{Y} = -0.27 + 1.27(ShelveLocGood) + 0.49(CompPrice) + -0.76(Price)\]

Appendix A: PCA

This is what I would try (though, I think there is enough evidence to suggest that this data is synthetic.) PCA doesn’t help us very much, but we’ll see…

# PCA (via psych::principal) on the numeric columns only. The first
# numeric column is dropped before fitting; per the weights/head output
# below, that column is Sales, the response. Uses `<-` for assignment
# per R convention.
scaled.nums <- dplyr::select_if(scaled.merged, is.numeric)
scaled.nums <- scaled.nums[, -1]  # drop the response (Sales) column
pc_result <- principal(scaled.nums, nfactors = 3)
# Variance accounted for by each rotated component.
print(pc_result$Vaccounted)
##                             RC1       RC2       RC3
## SS loadings           1.6233084 1.2805798 1.0666582
## Proportion Var        0.2319012 0.1829400 0.1523797
## Cumulative Var        0.2319012 0.4148412 0.5672209
## Proportion Explained  0.4088375 0.3225198 0.2686427
## Cumulative Proportion 0.4088375 0.7313573 1.0000000
print(pc_result$weights)
##                     RC1         RC2          RC3
## CompPrice    0.53157541 -0.06083110  0.030590603
## Income      -0.01204453 -0.14471355 -0.751358873
## Advertising  0.03963226  0.59649172  0.009275437
## Population  -0.02336339  0.63042187  0.026481975
## Price        0.53696973  0.04180948  0.010454833
## Age         -0.20779551 -0.03119157  0.245615463
## Education   -0.04934910 -0.08172391  0.567451434
fa.diagram(pc_result)

biplot(pc_result)

# Combine the component scores with the first four columns of the merged
# data (the factor columns plus Sales, per the head() output below), then
# one-hot encode the factors (one_hot requires a data.table).
pca.df <- cbind(pc_result$scores, scaled.merged[1:4])
pca.df.h1 <- one_hot(as.data.table(pca.df))
head(pca.df.h1)
##           RC1        RC2        RC3 ShelveLoc_Bad ShelveLoc_Good
## 1:  0.6561080 0.29767325  0.4189620             1              0
## 2: -1.2390863 1.02275930 -0.1430276             0              1
## 3: -1.2300855 0.52627007  0.5440203             0              0
## 4: -0.7871945 0.45417596 -0.7861223             0              0
## 5:  1.0142239 0.03510677 -0.2563580             1              0
## 6: -1.4017045 1.16533839 -0.3315862             1              0
##    ShelveLoc_Medium Urban_No Urban_Yes US_No US_Yes       Sales
## 1:                0        0         1     0      1  0.70948774
## 2:                0        0         1     0      1  1.31852808
## 3:                1        0         1     0      1  0.90777994
## 4:                1        0         1     0      1 -0.03410803
## 5:                0        0         1     1      0 -1.18491100
## 6:                0        1         0     0      1  1.17334986

Empirically I tried a GAM model and a SVM. These weren’t impressive enough out of the box to show here. On to the next task…